1 Preparatory steps for the Markdown file

Setting the options for knitr

# Global knitr chunk options and R session options for this report.
library(knitr)
knitr::opts_chunk$set(results = 'asis',
                      echo = TRUE,
                      comment = NA,
                      prompt = FALSE,
                      warning = FALSE,
                      message = FALSE,
                      fig.align = "center",
                      fig.width = 8.125,
                      out.width = "100%",
                      fig.path = "Figures/figures_pdfa/",
                      dev = c('png', 'tiff'),
                      dev.args = list(png = list(type = "cairo"), tiff = list(compression = 'lzw')),
                      # 'cache' was previously set twice (FALSE, then TRUE);
                      # caching is clearly intended since cache.path and
                      # autodep are configured, so only cache = TRUE is kept
                      cache = TRUE,
                      # NOTE(review): absolute Windows path — not portable;
                      # consider a project-relative cache directory
                      cache.path = "D:/cache/cache_pdfa/",
                      autodep = TRUE)
# Wide console output, no scientific notation, blank NA cells in kable tables
options(width = 1000, scipen = 999, knitr.kable.NA = '')

Loading libraries…

library(dplyr)
library(purrr)
library(forcats)
library(MASS)
library(ggplot2)
library(caret)
library(kableExtra)
library(pROC)
library(ggrepel)
library(GGally)
library(splitstackshape)
library(fmsb)

2 Defining useful functions

Functions to reorder a factor for proper display (Source: https://github.com/dgrtwo/drlib/blob/master/R/reorder_within.R)

# Reorder a factor within each facet, for use with faceted ggplot bar/point
# charts.  Each value of 'x' is suffixed with its facet label ('within') so
# that identical values in different facets become distinct levels, then the
# combined factor is reordered by 'by' using 'fun'.
# (Source: https://github.com/dgrtwo/drlib/blob/master/R/reorder_within.R)
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
  combined <- paste(x, within, sep = sep)
  stats::reorder(combined, by, FUN = fun)
}

# Companion to reorder_within(): an x-axis scale whose labels strip the
# "___facet" suffix that reorder_within() appended to the factor levels.
scale_x_reordered <- function(..., sep = "___") {
  suffix_pattern <- paste0(sep, ".+$")
  strip_suffix <- function(labels) gsub(suffix_pattern, "", labels)
  ggplot2::scale_x_discrete(labels = strip_suffix, ...)
}

Function to compute VIF and assess multicollinearity… (Source: https://beckmw.wordpress.com/2013/02/05/collinearity-and-stepwise-vif-selection/)

# Stepwise backward selection of explanatory variables based on the Variance
# Inflation Factor (VIF), after Marcus W. Beck
# (https://beckmw.wordpress.com/2013/02/05/collinearity-and-stepwise-vif-selection/).
#
# Each variable is regressed on all the others and its VIF is computed with
# fmsb::VIF().  While the largest VIF is >= 'thresh', the variable with the
# largest VIF is dropped and the VIFs are recomputed.
#
# Arguments:
#   in_frame : data frame (or object coercible to one) of explanatory variables
#   thresh   : VIF threshold above which variables are removed (default 10)
#   trace    : if TRUE, print the VIF table at each step
#   ...      : additional arguments passed on to lm()
# Returns: a character vector with the names of the retained variables.
vif_func <- function(in_frame, thresh = 10, trace = TRUE, ...){

  if (!is.data.frame(in_frame)) in_frame <- data.frame(in_frame)

  # Helper: compute the VIF of each variable of 'dat' regressed on all the
  # others.  Returns a two-column character matrix: variable name, VIF value.
  get_vifs <- function(dat, ...) {
    var_names <- names(dat)
    vifs <- NULL
    for (val in var_names) {
      regressors <- var_names[var_names != val]
      form_in <- formula(paste(val, '~', paste(regressors, collapse = '+')))
      vifs <- rbind(vifs, c(val, VIF(lm(form_in, data = dat, ...))))
    }
    vifs
  }

  # Helper: print one VIF table followed by a blank line
  print_vifs <- function(vif_tab) {
    prmatrix(vif_tab, collab = c('var', 'vif'),
             rowlab = rep('', nrow(vif_tab)), quote = FALSE)
    cat('\n')
  }

  # Initial VIF values for all the variables
  vif_init <- get_vifs(in_frame, ...)
  vif_max <- max(as.numeric(vif_init[, 2]), na.rm = TRUE)

  # If no VIF exceeds the threshold, keep every variable
  if (vif_max < thresh) {
    if (trace) {
      print_vifs(vif_init)
      cat(paste('All variables have VIF < ', thresh, ', max VIF ',
                round(vif_max, 2), sep = ''), '\n\n')
    }
    return(names(in_frame))
  }

  # Backward selection: drop the variable with the largest VIF and repeat
  # until all remaining VIFs are below 'thresh'
  in_dat <- in_frame
  while (vif_max >= thresh) {
    vif_vals <- get_vifs(in_dat, ...)

    max_row <- which(vif_vals[, 2] == max(as.numeric(vif_vals[, 2]), na.rm = TRUE))[1]
    vif_max <- as.numeric(vif_vals[max_row, 2])

    # Stop once no VIF value exceeds the threshold any more
    if (vif_max < thresh) break

    if (trace) {
      print_vifs(vif_vals)
      cat('removed: ', vif_vals[max_row, 1], vif_max, '\n\n')
      flush.console()
    }

    # drop = FALSE keeps a data frame even when a single column remains
    # (the original could collapse to a vector and break names() next pass)
    in_dat <- in_dat[, !names(in_dat) %in% vif_vals[max_row, 1], drop = FALSE]
  }

  return(names(in_dat))
}

Function to return a ggplot figure displaying a confusion matrix…

# Build a ggplot heat-map figure of a confusion matrix produced by
# caret::confusionMatrix().
#
# Arguments:
#   m : a caret confusionMatrix object; only m$table (predicted x observed
#       counts) is drawn.
# Returns: a ggplot object.
#
# NOTE(review): the caption reads the globals 'resultats', 'DV' and
# 'set_name', so this function must be called inside (or right after) the
# main pDFA loop where those objects are defined — confirm call sites.
ggplotConfusionMatrix <- function(m){
  # Caption summarizing the evaluation metrics of the current DV/feature set
  # ("Log Loss:" now carries a colon, consistent with the other labels)
  caption <- paste(
                  "Log Loss:", round(resultats[[DV]][[set_name]]$Log.Loss, 4),
                  "Kappa:", round(resultats[[DV]][[set_name]]$Kappa, 4),
                  "Multi-class AUC:", round(resultats[[DV]][[set_name]]$AUC, 4),
                  "Acc.:", round(resultats[[DV]][[set_name]]$Acc, 4),
                  "Bal. Acc.:", round(resultats[[DV]][[set_name]]$Bal.Acc, 4),
                  sep = " - ")

  p <- m$table %>%
    data.frame() %>%
    ggplot(aes(predicted, observed)) +
    geom_tile(aes(fill = Freq), colour = "white", show.legend = FALSE) +
    scale_fill_distiller(palette = "YlOrBr", direction = 1) +
    geom_text(aes(label = Freq), vjust = 1, size = 3) +
    labs(title = "Confusion Matrix (100 iterations - Testing set)",
         subtitle = paste(DV, set_name, sep = " - "),
         caption = caption
    ) +
    theme_classic() +
    # Applied AFTER theme_classic(): a complete theme would otherwise
    # overwrite this setting (the original had the order reversed)
    theme(legend.position = "none")
  return(p)
}

3 Loading R objects previously defined

We load data tables and definitions of features sets prepared during the curation phase…

load(file = "Study_objects.RData")

4 Defining the key variables

Defining the predicted variables (individual, type…)…

DVs <- c("individual", "type")

5 Choosing the feature sets

We have previously defined three feature sets: Bioacoustics, DCT and MFCC. For the pDFA, we are going to keep only the first two:

  • Bioacoustic : a set of “classical” bioacoustic parameters in the literature on animal communication
  • DCT : Duration + HNR + DCT 0 + DCT 1 + DCT 2 + DCT 3 + DCT 4

The MFCC predictor set is not used with pDFAs because of its high dimensionality and high collinearity that inadequately meet the assumptions of discriminant analysis (the size of the smallest class must be larger than the number of predictors)

set_names <- c("Bioacoustic", "DCT")

We create a (named) list of these sets…

SETs <- list(Bioacoustic_set, DCT_set)

6 Dealing with multicollinearity: VIF

The discriminant analysis does not automatically handle multicollinearity between predictors, which should be explicitly addressed.

A way to identify collinearity is to compute a variance inflation factor (VIF) for each explanatory variable. If we regress one explanatory variable by other explanatory variables, we obtain the coefficient of determination \(R^2\). VIF is defined by the following equation: \(VIF=1/(1-R^2)\).

If no explanatory variables are collinear, the VIF values will be all around 1. If the VIF is greater than 1, the predictors may be moderately collinear. A VIF between 5 and 10 indicates high collinearities that may be problematic. And if the VIF goes above 10, you could assume that the regression coefficients are poorly estimated due to multicollinearity.

“Montgomery & Peck (1992) used a value of 10, but a more stringent approach is to use values as low as 3 as we did here. High, or even moderate, collinearity is especially problematic when ecological signals are weak. In that case, even a VIF of 2 may cause non-significant parameter estimates, compared to the situation without collinearity.” — Zuur et al. (2010: 9)

As suggested by Zuur et al. (2010: 9), we use a stepwise strategy. The VIF values for all explanatory variables are computed first. If at least one of the VIF value is larger than a preselected threshold (here, 5), we drop the explanatory variable with the largest VIF. We then recalculate the VIFs and repeat the process until all VIFs are below the preselected threshold.

This custom stepwise VIF function has been built by Marcus W. Beck.

We reduce our Bioacoustic and DCT sets

# Reduce the Bioacoustic set: keep only the variables whose stepwise VIF
# stays below 5 (Zuur et al. 2010's stringent threshold)
df.set2 <- df %>%
    dplyr::select(all_of(SETs[[1]]))
    
Bioacoustic_set <- vif_func(in_frame = df.set2, thresh = 5, trace = F)

# Reduce the DCT set with the same VIF threshold
df.set2 <- df %>%
    dplyr::select(all_of(SETs[[2]]))
    
DCT_set <- vif_func(in_frame = df.set2, thresh = 5, trace = F)

We update our SETs object with the reduced Bioacoustic and DCT feature sets

SETs <- list(Bioacoustic_set, DCT_set)

6.1 Displaying the Bioacoustic set after VIF computing

knitr::combine_words(Bioacoustic_set)

duration, vocalization.HNR, q1f, q3f, f.max, q1t, q3t, t.max, f0.end, f0.slope.asc, and f0.slope.desc

6.2 Displaying the DCT set after VIF computing

knitr::combine_words(DCT_set)

duration, vocalization.HNR, dct0, dct1, dct2, dct3, and dct4

7 A Permuted Discriminant Analysis (pDFA)

In order to analyze the information conveyed by the individual signature as well as the call types, we implemented a permuted discriminant analysis (pDFA) inspired by Mundry and Sommer (2007) and Keenan et al. (2020), which directly aimed to control the aforementioned imbalance in our data.

7.1 Repeated pDFA to compute average performances

“In the first step of the DFA, a training data set was used to generate a set of linear discriminant functions. The training data set consisted of randomly selected sounds from each individual. The number of sounds selected per individual was the same for all individuals and equal to two-thirds of the smallest number of sounds that we obtained for an animal in our data set. In the second step, the discriminant functions generated from the training data set were used to classify the remaining sounds. For each individual, at least one-third of the sound provided by each individual was thus included in the validating data set. This cross-validation step gives a measure of the effect size (the percentage of correctly classified sounds, which has to be compared with chance, here 20%, i.e. 1/5 possible call types). We ran 100 iterations of these two-step DFAs with both training and validation data sets chosen at random. The mean effect size (mean percentage of correctly classified sounds) was obtained by calculating the average of the percentages of correctly classified sounds obtained with each of the 100 validation data sets.” — Keenan et al. (2020)

In the first step of this permuted form of discriminant analysis, a training sample was used to compute a set of linear discriminant functions. In order to test the call type distinctiveness, the training sample was randomly selected from each call type without controlling for the individual. Conversely, in order to test the individual vocal signature, the training sample was randomly selected from each individual’s vocalizations. The number of occurrences selected was the same for each call type and each individual. This number was equal to two thirds of the smallest number of vocalizations analyzed per call type (i.e., 206×2/3) and per individual (i.e., 71×2/3). These discriminant analysis models computed from training samples were used to classify all the remaining occurrences. Thus, for each individual and for each type of vocalization, at least one third of the occurrences were included in the test sample. We performed 100 iterations of these discriminant analyses with randomly selected training and test samples. The performance metrics and the percentage of correct classification were obtained by averaging the results obtained for each of the 100 test samples.

n.sel <- 100 # Number of iterations (random train/test splits per DV x feature set)

# We initialize a number of objects to store data across the loops below
CONTROL <- NULL                 # Name of the control variable (set per DV)
graphs <- list()                # Percent-correct-classification figures (train/test)
graphs.ld <- list()             # Pairwise linear-discriminant figures
graphs.classif <- list()        # Per-level classification-rate figures
resultats <- list()             # Summary metrics per DV x feature set
plot.confusionmatrix <- list()  # Average confusion-matrix figures
confusion.matrix <- list()      # Per-class metrics of the average confusion matrix
class2 <- list()                # Per-iteration classification records, bound
probabilities <- list()         # Posterior probabilities, one element per iteration
probabilities2 <- list()        # Posterior probabilities bound across iterations
# We iterate over our different DV 
# Main pDFA loop: for each DV and each feature set, run n.sel iterations of a
# stratified train/test LDA and aggregate metrics, figures and probabilities
for (DV in DVs) {

  # We define the CONTROL variable: the other key factor. It is not entered
  # in the model; classification rates are only broken down by its levels
  if (DV == "type") {
    CONTROL <- "individual"
  }

  if (DV == "individual") {
    CONTROL <- "type"
  }

  # Number of levels of the dependent variable (chance level = 100/ntf.levels)
  ntf.levels <- length(unique(df[[DV]]))

  # We compute 2/3 of the smallest number of observations we can find in one of the levels of the DV
  n.to.sel <- df %>%
    group_by(DV = get(DV)) %>%
    summarise(n = n() ) %>% 
    slice_min(n, with_ties = FALSE) %>%
    pull() %>%
    {round((2/3)*., digits=0)}

  # We iterate over our different feature sets
  for (i in 1:length(SETs)) {
    
    # We initialize per-set containers (one element per iteration k)
    class <- list()
    confusion <- list()
    ld <- list()
    prop.ld <- list()
    classif <- list()
      
    # We collapse the names of the variables in order to build a formula for our model
    SET2 <- paste(SETs[[i]], collapse = "+")
    
    # We build objects for the set under study
    SET <- c(SETs[[i]], DV)
    set_name <- set_names[i]

    # We create a subset of our main data table which contains only the explanatory variables of the set under study, the DV and the control variable
    df.set <- df %>%
      dplyr::select(all_of(SET), all_of(CONTROL)) %>%
      tibble::rownames_to_column(., var = "rowname")
    
    for (k in 1:n.sel) {
      set.seed(k)  # Set the random seed for reproducibility of each split
  
      # We make a random partition of df.set so that we pick n.to.sel elements in each level of the factor DV to assemble a training set
      # The test set is made of the remaining elements
      samples <- stratified(df.set, DV, size = n.to.sel, bothSets = TRUE)

      # Training set
      training <- samples$SAMP1
      training.class <- training %>% pull(all_of(DV))
      training.data <- training %>% dplyr::select(all_of(SETs[[i]]))

      # Class proportions in the training set, used as LDA priors
      training.prior <- training %>%
        count(get(DV)) %>%
        mutate(prop = prop.table(n)) %>%
        pull() %>%
        as.numeric()
      
      # Test set
      testing <- samples$SAMP2
      testing.class <- testing %>% pull(all_of(DV))
      testing.data <- testing %>% dplyr::select(all_of(SETs[[i]]))

      # Class proportions in the test set, used as priors when predicting
      testing.prior <- testing %>%
        count(get(DV)) %>%
        mutate(prop = prop.table(n)) %>%
        pull() %>%
        as.numeric()    
      
      # We define the DFA model with the training set
      # NOTE(review): eval(parse()) is fragile; lda(as.formula(paste(DV, "~", SET2)), ...) would avoid it
      model <- paste("lda(", DV, "~", SET2,", prior=training.prior, data=training)", sep="")
      # We compute the model and store the outputs in 'res'
      res <- eval(parse(text=model))

      training.pred <- predict(res)
      training$pred <- training.pred$class

      # We save the proportion of between-group variance captured by each
      # discriminant function (squared singular values, normalized)
      prop.ld[[k]] <- res$svd^2/sum(res$svd^2)            

      # We save the LD scores of the training observations (predict()$x)
      ld[[k]] <- training.pred$x
      
      # We apply the model to the test set
      testing.pred <- predict(res, testing.data, prior=testing.prior)
      testing$pred <- testing.pred$class
      
      # We extract prediction (posterior) probabilities, tagged with the
      # control level, the observed class and the original row name
      probabilities[[k]] <- as.data.frame(testing.pred$posterior) %>%
        bind_cols(control=testing[[CONTROL]],
                  observed=testing[[DV]],
                  rowname=as.numeric(testing$rowname))
      
      # We compute the negative of the multinomial log-likelihood (smaller is better)
      mnlogloss <- testing %>%
        mutate(observed=as.factor(get(DV))) %>%
        dplyr::select(observed) %>%
        bind_cols(testing.pred$posterior) %>%
        mnLogLoss(., lev = levels(.$observed)) %>% as.numeric()
      
      # We create the confusion matrix for the current iteration
      confusion[[k]] <- table(list(predicted=testing$pred, observed=testing[[DV]]))
      
      # We extract a few metrics: accuracy, Cohen's kappa, multi-class AUC
      acc <- t(confusionMatrix(confusion[[k]])$overall[[1]])
      kappa <- t(confusionMatrix(confusion[[k]])$overall[[2]])
      auc <- multiclass.roc(response = testing[[DV]], predictor=testing.pred$posterior)$auc[1]

      # The balanced accuracy in binary and multiclass classification problems is another option to address
      # imbalanced datasets. It is defined as the average of recall obtained on each class.
      # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.balanced_accuracy_score.html
      
      bal <- mean(confusionMatrix(confusion[[k]])[["byClass"]][,1])
      
      # We get the total number of correct classifications in the training set for the current iteration
      training.classif <- training %>%
        mutate(sum = sum(pred == get(DV))) %>%
        distinct(sum) %>%
        pull() %>%
        as.numeric()
      
      # We compute the percentage of correct classification in the training set for the current iteration
      training.pourc.classif <- training.classif / nrow(training)

      # We get the total number of correct classifications in the test set for the current iteration
      testing.classif <- testing %>%
        mutate(sum = sum(pred == get(DV))) %>%
        distinct(sum) %>%
        pull()  %>%
        as.numeric()
  
      # We compute the percentage of correct classification in the test set for the current iteration
      testing.pourc.classif <- testing.classif/(nrow(testing)) 
      
      # We create and store a data frame for the current iteration
      # We save the number of correct classifications and percentage of correct classification for both training and test sets
      # and also the different metrics for the test set
      class[[k]] <- data.frame(DV, k, 
                               n.training=nrow(training), training.classif, training.pourc.classif, 
                               n.testing=nrow(testing), testing.classif, testing.pourc.classif,
                               mnlogloss, kappa, auc, acc, bal)

      # We compute the rates of correct classification by level of the DV and by level of the CONTROL variable
      classif[[k]] <- testing %>%
        group_by(variable=testing[[DV]], control=testing[[CONTROL]]) %>% 
        summarise(
          total = length(variable),
          rate = sum(variable == pred) / total,
          .groups = 'drop')
    }

    # We store the results for the DV and feature set under study

    # 1. We bind the prediction probabilities for all the iterations
    probabilities2[[DV]][[set_name]] <- do.call(rbind, probabilities)

    # 2. We compute the average rates of correct classification for each level of the DV and each level of the CONTROL variable 
    classif2 <- do.call(rbind, classif) %>%
      group_by(variable, control) %>%
      summarize(rate = sum(rate) / n.sel, .groups = 'drop')
    
    # Display call types in a fixed, meaningful order on the figures
    if (DV == "type")
      classif2$variable <- factor(classif2$variable, levels = c("SCB", "B", "SB", "PY", "P"))
    
    # We store a ggplot figure displaying the classification performance for the levels of the DV and those of the CONTROL variable
    graphs.classif[[DV]][[set_name]] <- ggplot() +
      geom_point(data = classif2, aes(x = variable, y = rate, color = variable), alpha = 0.65) +
      geom_text_repel(data = classif2, aes(x = variable, y = rate, color = variable, label = control), min.segment.length = 0, seed = 42, box.padding = 0.5, max.overlaps = Inf, alpha=0.65) +
      geom_point(data = classif2 %>% group_by(variable) %>% summarise(rate = mean(rate)), aes(variable, rate, fill = variable), colour="black", pch=21, size = 4, alpha=0.95) +
      labs(x = "", y = "% Correct Classification", title = paste0("Testing dataset - ", DV, " - ", set_name)) +
      theme_classic()
    
    # We sum the values of the LDs for all the iterations and divide them by the number of iterations (n.sel)
    # NOTE(review): this element-wise average assumes rows correspond across iterations — confirm
    ld2 <- Reduce('+', ld)/n.sel
    ld2 <- data.frame(ld2, variable=training[[DV]])

    # We sum the values of the variances of LDs for all the iterations and divide them by the number of iterations (n.sel)
    prop.ld2 <- Reduce('+', prop.ld)/n.sel

    # We store a ggplot figure displaying the LDs
    graphs.ld[[DV]][[set_name]] <- ggpairs(ld2,
            columns = 1:(length(ld2)-1),
            aes(color = variable),
            upper = "blank", legend=1, 
            lower = list(continuous = "points"), 
            diag = list(continuous = "densityDiag"),
            axisLabels = "show") +
      theme_bw() +
      theme(legend.position = "bottom", 
            legend.title = element_blank()) +
      labs(caption = paste0("Mean LDs: ", paste(round(prop.ld2*100, 2), collapse= "% "), "%"),
          title = paste0("Training dataset - ", DV, " - ", set_name))

    # We bind the classification data for all the iterations
    class2[[DV]][[set_name]] <- do.call("rbind", class)
    
    # We store a data table containing the various assessments of classification performance
    resultats[[DV]][[set_name]] <- data.frame(
      "DV"=DV,
      "N.Iter"=n.sel,
      "N.Train"=round(mean(class2[[DV]][[set_name]]$n.training), 0),
      "N.Correct.Train"=round(mean(class2[[DV]][[set_name]]$training.classif), 0),
      "Perc.Correct.Train"=round(round(mean(class2[[DV]][[set_name]]$training.classif), 0)/round(mean(class2[[DV]][[set_name]]$n.training), 0)*100, 0),
      "N.Test"=round(mean(class2[[DV]][[set_name]]$n.testing),0),
      "N.Correct.Test"=round(mean(class2[[DV]][[set_name]]$testing.classif), 0),
      "Perc.Correct.Test"=round(round(round(mean(class2[[DV]][[set_name]]$testing.classif), 0)/round(mean(class2[[DV]][[set_name]]$n.testing),0)*100, 0), 0),   
      "Log.Loss"=round(mean(class2[[DV]][[set_name]]$mnlogloss), 3),
      "sd.Log.Loss"=round(sd(class2[[DV]][[set_name]]$mnlogloss), 3),
      "Kappa"=round(mean(class2[[DV]][[set_name]]$kappa), 3),
      "sd.Kappa"=round(sd(class2[[DV]][[set_name]]$kappa), 3),
      "AUC"=round(mean(class2[[DV]][[set_name]]$auc), 3),
      "sd.AUC"=round(sd(class2[[DV]][[set_name]]$auc), 3),
      "Acc"=round(mean(class2[[DV]][[set_name]]$acc), 3),
      "sd.Acc"=round(sd(class2[[DV]][[set_name]]$acc), 3),
      "Bal.Acc"=round(mean(class2[[DV]][[set_name]]$bal), 3),
      "sd.Bal.Acc"=round(sd(class2[[DV]][[set_name]]$bal), 3)
    )
 
    # We store a ggplot figure displaying the percentages of correct classification for the training set
    graphs[[DV]][[set_name]][[1]] <- ggplot(data=class2[[DV]][[set_name]]) +
      geom_point(aes(k, 100 * training.pourc.classif ), color = "darkorange3") +
      geom_smooth(aes(k, 100 * training.pourc.classif ), method = "loess", se = FALSE, color = "black") + # Loess regression
      geom_hline(yintercept = 100/ntf.levels, linetype = "dashed", color = "red") +
      annotate("text", x = 0, y = 100, label = paste("Average correct classification: ", resultats[[DV]][[set_name]]$Perc.Correct.Train,"%"), hjust = 0) +
      labs(x = "Iteration", y = "Percent of correct classification", title = paste0("Training dataset - ", DV, " - ", set_name)) +
      scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 20)) +
      theme_classic()
    
    # Per-level proportions in the test set, used as chance thresholds in the figure
    # NOTE(review): computed from the test set of the LAST iteration only — confirm this is intended
    seuils.ind <- testing %>%
        group_by(get(DV)) %>%
        summarise(n= (n()) ) %>%
        mutate( prop = 100*(n / sum(n)) )
    
    # We store a ggplot figure displaying the percentages of correct classification for the test set
    graphs[[DV]][[set_name]][[2]] <- ggplot(data=class2[[DV]][[set_name]]) +
      geom_point(aes(k, 100*testing.pourc.classif ), color="dodgerblue4") +
      geom_smooth(aes(k, 100*testing.pourc.classif ), method = "loess", se = FALSE, color = "black") + # Loess regression
      annotate("text", x = 0, y = 100, label = paste("Average correct classification: ", resultats[[DV]][[set_name]]$Perc.Correct.Test,"%"), hjust = 0) +
      geom_hline(yintercept = seuils.ind$prop, linetype="dashed", color = "red") +
      geom_text(data=seuils.ind, aes(0, prop, label = `get(DV)`), hjust = 0) +
      labs(x = "Iteration",
           y = "Percent of correct classification",
           title = paste0("Testing dataset - ", DV, " - ", set_name)) +
      scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 20)) +
      theme_classic()

    # We sum cell by cell the content of all the confusion matrices (one per iteration)
    # and we divide the values of the cells of the resulting matrix by the number of iterations (n.sel) to compute average values
    confusion2 <- Reduce('+', confusion)/n.sel
    
    # Per-class metrics (sensitivity, specificity, ...) of the average confusion matrix
    confusion.matrix[[DV]][[set_name]] <- t(confusionMatrix(confusion2)$byClass)

    # We create a figure for the average confusion matrix
    plot.confusionmatrix[[DV]][[set_name]] <- ggplotConfusionMatrix(confusionMatrix(confusion2))

  }
}

# We save the per-iteration records, the summary metrics and the posterior
# probabilities to .rds files for later reuse
saveRDS(class2, file = "Result files/pdfa-permuted-class.rds")
saveRDS(resultats, file = "Result files/pdfa-permuted-resultats01.rds")
saveRDS(probabilities2, file = "Result files/pdfa-permuted-probabilities.rds")

7.1.1 Displaying the diagnostics (Linear Discriminants)

# Print every LD figure (walk2 is used for its side effects; .y, the element
# name, is not used in the lambda)
graphs.ld %>%
  walk2(names(.), ~ print(.x ))

$Bioacoustic $DCT $Bioacoustic $DCT

7.1.2 Displaying the result graphs

# Print the correct-classification figures for every DV and feature set
graphs %>%
  walk2(names(.), ~ print(.x ))

$Bioacoustic $Bioacoustic[[1]] $Bioacoustic[[2]]

$DCT $DCT[[1]] $DCT[[2]]

$Bioacoustic $Bioacoustic[[1]] $Bioacoustic[[2]]

$DCT $DCT[[1]] $DCT[[2]]

7.1.3 Displaying a data table of the results

# Print one styled kable of the evaluation metrics per element, with grouped
# headers.  Fixes: "Original Testing Test" -> "Original Testing Set", and the
# "Perc" column label now carries a period like its twin.
resultats %>%
  dplyr::bind_rows() %>%
  walk2(names(.), ~ print(kable(.x, caption = paste0("Sets of predictors: ", .y),
                                digits=5,
                                col.names= c("DV", "N Iter.", "N", "N Correct Classif.", "Perc.", "N", "N Correct Classif.", "Perc.", "Log Loss", "sd Log Loss", "Kappa", "sd Kappa", "Multi-class AUC", "sd AUC", "Acc.", "sd Acc.", "Bal. Acc.", "sd Bal. Acc.") ) %>%
  kable_styling() %>%
  add_header_above(c(" " = 2, "Original Training Set" =3, "Original Testing Set" = 3, "Evaluation Metrics" = 10)) ))
Sets of predictors: Bioacoustic
Original Training Set
Original Testing Test
Evaluation Metrics
DV N Iter. N N Correct Classif. Perc N N Correct Classif. Perc. Log Loss sd Log Loss Kappa sd Kappa Multi-class AUC sd AUC Acc. sd Acc. Bal. Acc. sd Bal. Acc.
individual 100 470 172 37 1090 447 41 1.745 0.042 0.267 0.016 0.731 0.008 0.410 0.015 0.236 0.012
type 100 685 452 66 875 570 65 0.819 0.025 0.526 0.021 0.894 0.006 0.652 0.015 0.588 0.018
Sets of predictors: DCT
Original Training Set
Original Testing Test
Evaluation Metrics
DV N Iter. N N Correct Classif. Perc N N Correct Classif. Perc. Log Loss sd Log Loss Kappa sd Kappa Multi-class AUC sd AUC Acc. sd Acc. Bal. Acc. sd Bal. Acc.
individual 100 470 155 33 1090 468 43 1.698 0.056 0.288 0.016 0.731 0.009 0.430 0.016 0.235 0.010
type 100 685 464 68 875 590 67 0.813 0.020 0.554 0.018 0.901 0.006 0.675 0.013 0.596 0.017

7.1.4 Displaying the confusion Matrices

# Print the average confusion-matrix figures for every DV and feature set
plot.confusionmatrix %>%
  walk2(names(.), ~ print(.x ))

$Bioacoustic $DCT $Bioacoustic $DCT

7.1.5 Displaying the % of correct classification per level of the DV and per level of the control variable

# Print the per-level (DV x control) classification-rate figures
graphs.classif %>%
  walk2(names(.), ~ print(.x ))

$Bioacoustic $DCT $Bioacoustic $DCT

7.2 Building a baseline to assess the significance of the average performance

“In addition to the cross-validated DFAs performed on original datasets, new data sets were also created, where the identity of sounds was randomly permuted between individuals (pDFA), to obtain the statistical significance of the mean effect size. From these randomized sets, the same steps, fitting and validation, were performed consecutively. After 1000 iterations, we calculated the proportion of randomized validation data sets revealing a number of correctly classified calls being at least as large as the effect size obtained with the non randomized validation data set. This proportion gives the significance of the discrimination level and is equivalent to a P-value (Dentressangle et al., 2012 ; Mathevon et al., 2010 ; Mundry & Sommer, 2007).” — Keenan et al. (2020)

In order to statistically compare the performance reached by discriminant analysis to the chance level, additional discriminant analyses were also performed on modified datasets created by recombination, following the permuted DFA paradigm. Recombination consists in assigning a class label (call type or individual identity, depending on the task considered) to an observation by a random permutation over the dataset. Training and test sets were then drawn following the same rules as for the original dataset, and a DFA was applied, leading to an accuracy characteristic of a random classification. This procedure was iterated 1,000 times, resulting in a robust estimation of the distribution of random accuracies. Empirical p-value of the performance reached with the original discriminant analysis was then obtained by dividing the number of randomized datasets that revealed a percentage of correctly classified calls or individuals at least as large as the one obtained for the original dataset by the total number of datasets tested (Mundry & Sommer, 2007 ; Keenan et al., 2020).

n.perm <- 1000 # Number of permutations per DV x feature-set analysis

# Permutation-test counters; they start at 1 so that the empirical p-value can
# never be exactly 0 (the observed result counts as one permutation)
p.train <- 1
p.test <- 1
resultats2 <- list()          # Null-distribution summaries and p-values
perm.class2 <- list()         # Per-permutation classification records
graphs.permutations <- list() # Figures of the null classification rates

# Baseline (null) pDFA loop: for each DV and each feature set, repeat the
# two-step DFA on permuted data (class labels randomly reassigned) n.perm
# times, building the null distribution of classification performance and the
# empirical p-values of the original analyses.
for (DV in DVs) {

  # Number of levels of the dependent variable (chance level = 100/ntf.levels)
  ntf.levels <- length(unique(df[[DV]]))

  # 2/3 of the smallest class size = per-class training sample size
  n.to.sel <- df %>%
    group_by(DV = get(DV)) %>%
    summarise(n = n()) %>%
    slice_min(n, with_ties = FALSE) %>%
    pull() %>%
    {round((2/3)*., digits = 0)}

  for (i in seq_along(SETs)) {

    # Reset the permutation counters for each DV x feature-set analysis.
    # They start at 1 so the empirical p-value can never be 0 (the observed
    # statistic counts as one permutation, cf. Mundry & Sommer 2007).
    # NOTE(review): the original initialized these once before the loops, so
    # counts leaked across analyses; resetting here yields per-analysis
    # p-values as intended.
    p.train <- 1
    p.test <- 1

    perm.class <- list()       # Per-permutation classification records
    perm.confusion <- list()   # Per-permutation confusion matrices

    # Right-hand side of the model formula (predictors joined by '+')
    APPEL_MODEL <- paste(SETs[[i]], collapse = "+")
    SET <- c(SETs[[i]], DV)
    set_name <- set_names[i]

    df.set.ini <- df %>% dplyr::select(all_of(SET))

    for (k in 1:n.perm) {

      set.seed(k)  # Reproducible permutation and split for iteration k

      # We shuffle the rows, then restore the original DV column: this
      # randomly reassigns class labels to observations (permutation step)
      df.set <- slice(df.set.ini, sample(1:n()))
      df.set[[DV]] <- df.set.ini[[DV]]

      # Stratified split: n.to.sel observations per DV level for training,
      # the remainder for testing
      echantillons <- stratified(df.set, DV, size = n.to.sel, bothSets = TRUE)

      # Training set
      training <- echantillons$SAMP1
      training.class <- training %>% pull(all_of(DV))
      training.data <- training %>% dplyr::select(all_of(SETs[[i]]))

      # Class proportions in the training set, used as LDA priors
      training.prior <- training %>%
        count(get(DV)) %>%
        mutate(prop = prop.table(n)) %>%
        pull() %>%
        as.numeric()

      # Test set
      testing <- echantillons$SAMP2
      testing.class <- testing %>% pull(all_of(DV))
      testing.data <- testing %>% dplyr::select(all_of(SETs[[i]]))

      # Class proportions in the test set, used as priors when predicting
      testing.prior <- testing %>%
        count(get(DV)) %>%
        mutate(prop = prop.table(n)) %>%
        pull() %>%
        as.numeric()

      # We build and evaluate the LDA call (DV and predictors vary per set)
      model <- paste("lda(", DV, "~", APPEL_MODEL, ", prior=training.prior, data=training)", sep = "")
      res <- eval(parse(text = model))

      training.pred <- predict(res)
      training$pred <- training.pred$class

      # We apply the model to the test set
      testing.pred <- predict(res, testing.data, prior = testing.prior)
      testing$pred <- testing.pred$class

      # Confusion matrix of the current permutation
      perm.confusion[[k]] <- table(list(predicted = testing$pred, observed = testing[[DV]]))

      # Number of correct classifications in the (permuted) training set
      perm.training.classif <- training %>%
        mutate(sum = sum(pred == get(DV))) %>%
        distinct(sum) %>%
        pull() %>%
        as.numeric()

      perm.training.pourc.classif <- perm.training.classif / (nrow(training))

      # Number of correct classifications in the (permuted) test set
      perm.testing.classif <- testing %>%
        mutate(sum = sum(pred == get(DV))) %>%
        distinct(sum) %>%
        pull() %>%
        as.numeric()

      perm.testing.pourc.classif <- perm.testing.classif / (nrow(testing))

      # Record of the current permutation
      perm.class[[k]] <- data.frame(k, perm.n.training = nrow(training), perm.training.classif, perm.training.pourc.classif, perm.n.testing = nrow(testing), perm.testing.classif, perm.testing.pourc.classif)

      # Count permutations performing at least as well as the original analysis
      if (perm.training.classif >= resultats[[DV]][[set_name]]$N.Correct.Train) {
        p.train <- p.train + 1
      }

      if (perm.testing.classif >= resultats[[DV]][[set_name]]$N.Correct.Test) {
        p.test <- p.test + 1
      }

    }

    perm.class2[[DV]][[set_name]] <- do.call("rbind", perm.class)

    # Null-distribution summary and empirical p-values (p = count/n.perm,
    # with the counter seeded at 1)
    resultats2[[DV]][[set_name]] <- data.frame(
      "DV" = DV,
      "N.Iter" = n.perm,
      "N.Train" = round(mean(perm.class2[[DV]][[set_name]]$perm.n.training), 0),
      "N.Correct.Train" = round(mean(perm.class2[[DV]][[set_name]]$perm.training.classif), 0),
      "Perc.Correct.Train" = round(round(mean(perm.class2[[DV]][[set_name]]$perm.training.classif), 0)/round(mean(perm.class2[[DV]][[set_name]]$perm.n.training), 0)*100, 0),
      "P.Value.Train" = p.train/n.perm,
      "N.Test" = round(mean(perm.class2[[DV]][[set_name]]$perm.n.testing), 0),
      "N.Correct.Test" = round(mean(perm.class2[[DV]][[set_name]]$perm.testing.classif), 0),
      "Perc.Correct.Test" = round(round(mean(perm.class2[[DV]][[set_name]]$perm.testing.classif), 0)/round(mean(perm.class2[[DV]][[set_name]]$perm.n.testing), 0)*100, 0),
      "P.Value.Test" = p.test/n.perm
    )

    # Figure: per-permutation correct classification on the training sets
    graphs.permutations[[DV]][[set_name]][[1]] <- ggplot(data=perm.class2[[DV]][[set_name]]) +
      geom_point(aes(k, 100*perm.training.pourc.classif ), color="darkorange3") +
      geom_smooth(aes(k, 100*perm.training.pourc.classif ), method = "loess", se = FALSE, color = "black") + # Loess regression
      geom_hline(yintercept=100/ntf.levels, linetype="dashed", color = "red") +
      annotate("text", x = 0, y = 100, label = paste("Average correct classification: ", resultats2[[DV]][[set_name]]$Perc.Correct.Train, "%"), hjust = 0) +
      labs(x = "Iteration",
           y = "Percent of correct classification",
           title = paste0("Training permuted dataset - ", DV, " - ", set_name)) +
      scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 20)) +
      theme_classic()

    # Per-level proportions in the test set, used as chance thresholds
    # NOTE(review): computed from the LAST permutation's test set only — confirm
    seuils.ind <- testing %>%
      group_by(get(DV)) %>%
      summarise(n= (n()) ) %>%
      mutate( prop = 100*(n / sum(n)) )

    # Figure: per-permutation correct classification on the test sets
    graphs.permutations[[DV]][[set_name]][[2]] <- ggplot(data=perm.class2[[DV]][[set_name]]) +
      geom_point(aes(k, 100*perm.testing.pourc.classif ), color="dodgerblue4") +
      geom_smooth(aes(k, 100*perm.testing.pourc.classif ), method = "loess", se = FALSE, color = "black") + # Loess regression
      annotate("text", x = 0, y = 100, label = paste("Average correct classification: ", resultats2[[DV]][[set_name]]$Perc.Correct.Test, "%"), hjust = 0) +
      geom_hline(yintercept = seuils.ind$prop, linetype="dashed", color = "red") +
      geom_text(data=seuils.ind, aes(0, prop, label = `get(DV)`), hjust = 0) +
      labs(x = "Iteration",
           y = "Percent of correct classification",
           title = paste0("Testing permuted dataset - ", DV, " - ", set_name)) +
      scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 20)) +
      theme_classic()
  }
}

We save the results in .RDS files:

# We save the permutation records and the null-distribution summaries
saveRDS(perm.class2, file = "Result files/pdfa-permuted-perm.class.rds")
saveRDS(resultats2, file = "Result files/pdfa-permuted-resultats02.rds")

7.2.1 Displaying the result graphs

# Print the permutation (null-distribution) figures for every DV and feature set
graphs.permutations %>%
  walk2(names(.), ~ print(.x ))

$Bioacoustic $Bioacoustic[[1]] $Bioacoustic[[2]]

$DCT $DCT[[1]] $DCT[[2]]

$Bioacoustic $Bioacoustic[[1]] $Bioacoustic[[2]]

$DCT $DCT[[1]] $DCT[[2]]

7.2.2 Displaying the results in a table

# Print one styled kable of the permutation-test summaries per element, with
# grouped headers.  Fix: "Randomized Testing Test" -> "Randomized Testing Set".
resultats2 %>%
  dplyr::bind_rows()%>%
  walk2(names(.), ~ print(kable(.x, caption = paste0("Sets of predictors: ", .y),
                                digits=5,
                                col.names= c("DV", "N Iter.", "N", "N Correct Classif.", "Perc.", "P value", "N", "N Correct Classif.", "Perc.", "P value") ) %>%
  kable_styling() %>%
  add_header_above(c(" " = 2, "Randomized Training Set" = 4, "Randomized Testing Set" = 4)) ))
Sets of predictors: Bioacoustic
Randomized Training Set
Randomized Testing Test
DV N Iter. N N Correct Classif. Perc. P value N N Correct Classif. Perc. P value
individual 1000 470 89 19 0.001 1090 268 25 0.001
type 1000 685 185 27 0.001 875 287 33 0.001
Sets of predictors: DCT
Randomized Training Set
Randomized Testing Set
DV N Iter. N N Correct Classif. Perc. P value N N Correct Classif. Perc. P value
individual 1000 470 77 16 0.001 1090 285 26 0.001
type 1000 685 172 25 0.001 875 293 33 0.001

8 Environment

R version 4.1.3 (2022-03-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)
OS version: Windows 10 x64 (build 22000)

9 Packages

stats: The R Stats Package version 4.1.3
graphics: The R Graphics Package version 4.1.3
grDevices: The R Graphics Devices and Support for Colours and Fonts version 4.1.3
utils: The R Utils Package version 4.1.3
datasets: The R Datasets Package version 4.1.3
methods: Formal Methods and Classes version 4.1.3
base: The R Base Package version 4.1.3
fmsb: Functions for Medical Statistics Book with some Demographic Data version 0.7.3
splitstackshape: Stack and Reshape Datasets After Splitting Concatenated Values version 1.4.8
GGally: Extension to ‘ggplot2’ version 2.1.2
ggrepel: Automatically Position Non-Overlapping Text Labels with ‘ggplot2’ version 0.9.1
pROC: Display and Analyze ROC Curves version 1.18.0
kableExtra: Construct Complex Table with ‘kable’ and Pipe Syntax version 1.3.4
caret: Classification and Regression Training version 6.0-90
lattice: Trellis Graphics for R version 0.20-45
ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics version 3.3.5
MASS: Support Functions and Datasets for Venables and Ripley’s MASS version 7.3-55
forcats: Tools for Working with Categorical Variables (Factors) version 0.5.1
purrr: Functional Programming Tools version 0.3.4
dplyr: A Grammar of Data Manipulation version 1.0.8
knitr: A General-Purpose Package for Dynamic Report Generation in R version 1.37


  1. Département des arts, des lettres et du langage, Université du Québec à Chicoutimi, Chicoutimi, Canada↩︎

  2. Université de Lyon, CNRS, Laboratoire Dynamique Du Langage, UMR 5596, Lyon, France↩︎

  3. ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France↩︎

  4. Département des arts, des lettres et du langage, Université du Québec à Chicoutimi, Chicoutimi, Canada↩︎

  5. ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France↩︎

  6. ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France↩︎

  7. Department of Linguistics, The University of Hong Kong, Hong Kong, China, Corresponding author↩︎